####################
# collect legislation per year
# classify left right
# - policy area (general, economic, cultural)
# - manifesto (preamble - translated) (BERT) (general, economic, cultural)
# - llama (whole) (general, economic, cultural)
# - outputs: 3 data-sets
#     * original texts
#     * legislation scores - categories
#     * annual scores
#####################
# function to collect EurLex legislation by year
library(eurlex)
library(future.apply)
library(future.callr)
## helpers ----

# Query EUR-Lex for every act of the given `type` ("directive",
# "regulation", "decision"), including date and directory code.
# Returns the query result sorted by `date` with an added `year` column.
fetch_legislation_index <- function(type) {
  index <- eurlex::elx_run_query(
    eurlex::elx_make_query(type,
                           include_force = FALSE,
                           include_date = TRUE,
                           include_directory = TRUE)
  )
  index <- dplyr::arrange(index, date)
  index$year <- lubridate::year(index$date)
  index
}

# Build the Publications Office resource URI for each CELEX id.
# paste0() would return the bare prefix for zero-length input, so an empty
# query result is mapped to an empty URI vector explicitly.
celex_uri <- function(celex) {
  if (length(celex) == 0L) {
    return(character(0))
  }
  paste0("http://publications.europa.eu/resource/celex/", celex)
}

# Fetch one field (`what` = "title" or "text") for `url`.
# Best effort: on error the failure is reported via message() and NULL is
# returned so that a single bad document does not abort the whole run.
fetch_or_null <- function(url, what) {
  tryCatch(
    eurlex::elx_fetch_data(url, type = what),
    error = function(e) {
      # conditionMessage() gives the readable message; pasting the condition
      # object itself would also paste its deparsed call.
      message("Error: ", conditionMessage(e), " (", what, " for ", url, ")")
      NULL
    }
  )
}

# Strip layout residue from a fetched text and normalise whitespace.
# With `split_paragraphs = TRUE` the text is first split into paragraphs
# with tokenizers::tokenize_paragraphs().
clean_legislation_text <- function(text, split_paragraphs = FALSE) {
  text <- stringr::str_remove_all(text, "•") |>
    stringr::str_remove_all("---pagebreak---") |>
    stringr::str_remove_all("Official Journal of the European Communities")

  if (split_paragraphs) {
    text <- tokenizers::tokenize_paragraphs(text)
  }

  # sapply() is kept on purpose: its simplification determines how the text
  # field is serialised by as.character() when the record is written.
  sapply(text, function(x) {
    x <- stringr::str_replace_all(x, "\n", " ")
    stringr::str_squish(x)
  })
}

# Fetch title and text for every entry of `uri` in parallel and write one
# file "<out_dir>/<prefix>_<i>.txt" per entry.
#   legismat         query result holding the metadata columns
#   uri              resource URIs to fetch
#   prefix           file name prefix (the legislation type)
#   rows             row of `legismat` that describes uri[i]; defaults to
#                    the same position, i.e. uri built row by row
#   split_paragraphs passed on to clean_legislation_text()
#   out_dir          output directory, created if it does not exist
# Returns a list of NULLs, one per URI; the files are the real output.
save_legislation_texts <- function(legismat, uri, prefix,
                                   rows = seq_along(uri),
                                   split_paragraphs = FALSE,
                                   out_dir = "../data/legislation") {
  stopifnot(length(rows) == length(uri))
  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

  future_lapply(seq_along(uri), function(i) {
    row <- rows[i]
    title <- fetch_or_null(uri[i], "title")
    text <- fetch_or_null(uri[i], "text")
    text <- clean_legislation_text(text, split_paragraphs = split_paragraphs)

    out <- list(title = title,
                date = legismat$date[row],
                year = legismat$year[row],
                type = legismat$type[row],
                celex = legismat$celex[row],
                directory = legismat$directory[row],
                work = legismat$work[row],
                url = uri[i],
                text = text)
    # One line per field, in the order listed above.
    writeLines(as.character(out),
               file.path(out_dir, paste0(prefix, "_", i, ".txt")))
    NULL
  })
}

plan("callr")

## directives
type <- "directive"
legismat <- fetch_legislation_index(type)

# Each CELEX id is fetched once. The metadata for uri[i] is taken from the
# first row carrying that id; indexing legismat by the position in the
# de-duplicated vector would pair documents with the wrong rows as soon as
# an id occurs on more than one row.
directive_celex <- unique(legismat$celex)
uri <- celex_uri(directive_celex)

# NOTE(review): only directives are de-duplicated and split into paragraphs;
# regulations and decisions below are processed row by row as single texts.
# Confirm whether that difference is intended.
temp <- save_legislation_texts(legismat, uri, prefix = type,
                               rows = match(directive_celex, legismat$celex),
                               split_paragraphs = TRUE)

## regulations
type <- "regulation"
legismat <- fetch_legislation_index(type)
uri <- celex_uri(legismat$celex)
temp <- save_legislation_texts(legismat, uri, prefix = type)

## decisions
type <- "decision"
legismat <- fetch_legislation_index(type)
uri <- celex_uri(legismat$celex)
temp <- save_legislation_texts(legismat, uri, prefix = type)
